suppressWarnings(library(tidyverse))
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
suppressWarnings(library(leaflet))
suppressWarnings(library(cluster))
suppressWarnings(library(factoextra))
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
suppressWarnings(library(glmnet))
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
suppressWarnings(library(PerformanceAnalytics))
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following object is masked from 'package:leaflet':
##
## addLegend
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
##
## Attaching package: 'PerformanceAnalytics'
##
## The following object is masked from 'package:graphics':
##
## legend
# Cargar los datasets
train <- read.csv("train.csv")
test <- read.csv("test.csv")
# Imputar valores nulos en total_bedrooms
train$total_bedrooms[is.na(train$total_bedrooms)] <- median(train$total_bedrooms, na.rm = TRUE)
test$total_bedrooms[is.na(test$total_bedrooms)] <- median(test$total_bedrooms, na.rm = TRUE)
ggplot(train, aes(x = longitude, y = latitude, color = ocean_proximity)) +
geom_point() +
labs(title = "Mapa de precios de casas por coordenadas",
x = "Longitud",
y = "Latitud",
color = "Cercania")
ggplot(train, aes(x = longitude, y = latitude, color = median_house_value)) +
geom_point() +
scale_color_gradient(low = "blue", high = "red") +
labs(title = "Mapa de precios de casas por coordenadas",
x = "Longitud",
y = "Latitud",
color = "Precio de la Casa")
m <- leaflet(train) %>%
addTiles() %>%
addCircleMarkers(~longitude, ~latitude, radius = ~median_house_value/100000,
color = ~ifelse(median_house_value > median(train$median_house_value), 'red', 'blue'),
popup = ~paste("median_house_value:", median_house_value))
m
train$ocean_proximity_num <- as.numeric(as.factor(train$ocean_proximity))
# Calcular la silueta promedio para k = 2 a 10
k_values <- 2:15
fviz_nbclust(train[, c("longitude", "latitude", "ocean_proximity_num")], kmeans, method = "wss")
calcular_silueta_promedio <- function(data, k) {
set.seed(42)
kmeans_result <- kmeans(data, centers = k)
silueta <- silhouette(kmeans_result$cluster, dist(data))
mean(silueta[, 3]) # Promedio del coeficiente de silueta
}
silueta_promedio <- sapply(k_values, function(k) calcular_silueta_promedio(train[, c("longitude", "latitude", "ocean_proximity_num")], k))
# Visualizar los resultados
plot(k_values, silueta_promedio, type = "b", pch = 19,
xlab = "Número de Clusters (k)", ylab = "Coeficiente de Silueta Promedio",
main = "Coeficiente de Silueta Promedio para Diferentes k")
num_clusters <- 5
clusters <- kmeans(train[, c("longitude", "latitude", "ocean_proximity_num")], centers = num_clusters)
train$position_cluster <- as.factor(clusters$cluster)
ggplot(train, aes(x = longitude, y = latitude, color = position_cluster)) +
geom_point() +
labs(title = "Clustering de casas por coordenadas",
x = "Longitud",
y = "Latitud",
color = "Cluster")
continuas = names(train)[sapply(train, is.numeric) & names(train) != "median_house_value" & names(train) != "id" & 30 < sapply(train, n_distinct) ]
continuas
## [1] "longitude" "latitude" "housing_median_age"
## [4] "total_rooms" "total_bedrooms" "population"
## [7] "households" "median_income"
for (column_name in continuas) {
vector_numerico = unlist(train[column_name])
Q1 <- quantile(vector_numerico, 0.25) # Primer cuartil (Q1)
Q3 <- quantile(vector_numerico, 0.75) # Tercer cuartil (Q3)
IQR <- Q3 - Q1 # Rango intercuartÃlico
# Definir los lÃmites inferior y superior para identificar outliers
limite_inferior <- Q1 - 1.5 * IQR
limite_superior <- Q3 + 1.5 * IQR
# Identificar los outliers
outliers <- vector_numerico[vector_numerico < limite_inferior | vector_numerico > limite_superior]
# Contar los outliers
conteo_outliers <- length(outliers)
# Resultados
print(column_name)
print(paste("Número de outliers identificados usando IQR:", conteo_outliers))
print(paste("Prosnetaje de outliers identificados usando IQR:", conteo_outliers/nrow(train)))
}
## [1] "longitude"
## [1] "Número de outliers identificados usando IQR: 0"
## [1] "Prosnetaje de outliers identificados usando IQR: 0"
## [1] "latitude"
## [1] "Número de outliers identificados usando IQR: 0"
## [1] "Prosnetaje de outliers identificados usando IQR: 0"
## [1] "housing_median_age"
## [1] "Número de outliers identificados usando IQR: 0"
## [1] "Prosnetaje de outliers identificados usando IQR: 0"
## [1] "total_rooms"
## [1] "Número de outliers identificados usando IQR: 912"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.0631272928635703"
## [1] "total_bedrooms"
## [1] "Número de outliers identificados usando IQR: 914"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.0632657299093237"
## [1] "population"
## [1] "Número de outliers identificados usando IQR: 861"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.0595971481968575"
## [1] "households"
## [1] "Número de outliers identificados usando IQR: 859"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.059458711151104"
## [1] "median_income"
## [1] "Número de outliers identificados usando IQR: 486"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.0336402021180868"
suppressWarnings(chart.Correlation(train[continuas], histogram=TRUE))
remove_outliers <- function(df, columns) {
for (col in columns) {
Q1 <- quantile(df[[col]], 0.25)
Q3 <- quantile(df[[col]], 0.75)
IQR <- Q3 - Q1
df <- df[!(df[[col]] < (Q1 - 1.5 * IQR) | df[[col]] > (Q3 + 1.5 * IQR)), ]
}
return(df)
}
train_no_outliers <- remove_outliers(train, continuas)
cat("Tamaño del dataset train_processed original: ", nrow(train), "\n")
## Tamaño del dataset train_processed original: 14447
cat("Tamaño del dataset train_processed sin outliers: ", nrow(train_no_outliers), "\n")
## Tamaño del dataset train_processed sin outliers: 12272
for (column_name in continuas) {
vector_numerico = unlist(train_no_outliers[column_name])
Q1 <- quantile(vector_numerico, 0.25) # Primer cuartil (Q1)
Q3 <- quantile(vector_numerico, 0.75) # Tercer cuartil (Q3)
IQR <- Q3 - Q1 # Rango intercuartÃlico
# Definir los lÃmites inferior y superior para identificar outliers
limite_inferior <- Q1 - 1.5 * IQR
limite_superior <- Q3 + 1.5 * IQR
# Identificar los outliers
outliers <- vector_numerico[vector_numerico < limite_inferior | vector_numerico > limite_superior]
# Contar los outliers
conteo_outliers <- length(outliers)
# Resultados
print(column_name)
print(paste("Número de outliers identificados usando IQR:", conteo_outliers))
print(paste("Prosnetaje de outliers identificados usando IQR:", conteo_outliers/nrow(train)))
}
## [1] "longitude"
## [1] "Número de outliers identificados usando IQR: 0"
## [1] "Prosnetaje de outliers identificados usando IQR: 0"
## [1] "latitude"
## [1] "Número de outliers identificados usando IQR: 0"
## [1] "Prosnetaje de outliers identificados usando IQR: 0"
## [1] "housing_median_age"
## [1] "Número de outliers identificados usando IQR: 0"
## [1] "Prosnetaje de outliers identificados usando IQR: 0"
## [1] "total_rooms"
## [1] "Número de outliers identificados usando IQR: 178"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.0123208970720565"
## [1] "total_bedrooms"
## [1] "Número de outliers identificados usando IQR: 53"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.00366858171246626"
## [1] "population"
## [1] "Número de outliers identificados usando IQR: 68"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.00470685955561708"
## [1] "households"
## [1] "Número de outliers identificados usando IQR: 10"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.000692185228767218"
## [1] "median_income"
## [1] "Número de outliers identificados usando IQR: 93"
## [1] "Prosnetaje de outliers identificados usando IQR: 0.00643732262753513"
suppressWarnings(chart.Correlation(train_no_outliers[continuas], histogram=TRUE))
## Metodos Descartados - Seleccion de Variables